{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "31067bb0-6fcb-493a-aef2-b6ddacf45a1d",
   "metadata": {},
   "source": [
    "# Sentence Window Retrieval Pack\n",
    "\n",
    "This LlamaPack provides an example of our sentence window retriever."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b11fbbd4-4798-4051-a5b4-187a05b369b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "abc964cd-e52e-4886-b6f2-839d34d0920e",
   "metadata": {},
   "source": [
    "### Setup Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "385a896e-a7e2-4a56-8352-9a06bf3b47d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2023-11-26 19:47:32--  https://www.dropbox.com/s/f6bmb19xdg0xedm/paul_graham_essay.txt?dl=1\n",
      "Resolving www.dropbox.com (www.dropbox.com)... 2620:100:6017:18::a27d:212, 162.125.13.18\n",
      "Connecting to www.dropbox.com (www.dropbox.com)|2620:100:6017:18::a27d:212|:443... connected.\n",
      "HTTP request sent, awaiting response... 302 Found\n",
      "Location: /s/dl/f6bmb19xdg0xedm/paul_graham_essay.txt [following]\n",
      "--2023-11-26 19:47:33--  https://www.dropbox.com/s/dl/f6bmb19xdg0xedm/paul_graham_essay.txt\n",
      "Reusing existing connection to [www.dropbox.com]:443.\n",
      "HTTP request sent, awaiting response... 302 Found\n",
      "Location: https://uc5bd2ddbf6f7adbcc0b0f2b0f70.dl.dropboxusercontent.com/cd/0/get/CISajRC9E-jXCUmjwbMOz-HbrY4pt7I5ZOUNRWUHVA20k-NvzHIOxgUAlyf3LacM0TLbQvw7ZBnZID1s2Zr7gqRhfg-xMXwO4YO-c6UswKydEBd5avPneT1AydRAutGtr2lavAdm7-9FM7c5iMqDHJD1/file?dl=1# [following]\n",
      "--2023-11-26 19:47:33--  https://uc5bd2ddbf6f7adbcc0b0f2b0f70.dl.dropboxusercontent.com/cd/0/get/CISajRC9E-jXCUmjwbMOz-HbrY4pt7I5ZOUNRWUHVA20k-NvzHIOxgUAlyf3LacM0TLbQvw7ZBnZID1s2Zr7gqRhfg-xMXwO4YO-c6UswKydEBd5avPneT1AydRAutGtr2lavAdm7-9FM7c5iMqDHJD1/file?dl=1\n",
      "Resolving uc5bd2ddbf6f7adbcc0b0f2b0f70.dl.dropboxusercontent.com (uc5bd2ddbf6f7adbcc0b0f2b0f70.dl.dropboxusercontent.com)... 2620:100:6057:15::a27d:d0f, 162.125.13.15\n",
      "Connecting to uc5bd2ddbf6f7adbcc0b0f2b0f70.dl.dropboxusercontent.com (uc5bd2ddbf6f7adbcc0b0f2b0f70.dl.dropboxusercontent.com)|2620:100:6057:15::a27d:d0f|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 75047 (73K) [application/binary]\n",
      "Saving to: ‘paul_graham_essay.txt’\n",
      "\n",
      "paul_graham_essay.t 100%[===================>]  73.29K  --.-KB/s    in 0.02s   \n",
      "\n",
      "2023-11-26 19:47:33 (3.69 MB/s) - ‘paul_graham_essay.txt’ saved [75047/75047]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget \"https://www.dropbox.com/s/f6bmb19xdg0xedm/paul_graham_essay.txt?dl=1\" -O paul_graham_essay.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7bee4c6b-c0a1-4c16-a577-ba1d51eb25e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import SimpleDirectoryReader\n",
    "from llama_index.node_parser import SimpleNodeParser\n",
    "\n",
    "# load in some sample data\n",
    "reader = SimpleDirectoryReader(input_files=[\"paul_graham_essay.txt\"])\n",
    "documents = reader.load_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f86afa57-ebb9-41d0-a756-af01acb798bb",
   "metadata": {},
   "source": [
    "### Download and Initialize Pack\n",
    "\n",
    "Note that this pack directly takes in the html file, no need to load it beforehand."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "85f0f103-edb0-45ce-a85d-64519df38176",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.llama_pack import download_llama_pack\n",
    "\n",
    "SentenceWindowRetrieverPack = download_llama_pack(\n",
    "    \"SentenceWindowRetrieverPack\",\n",
    "    \"./sentence_window_retriever_pack\",\n",
    "    # leave the below commented out (was for testing purposes)\n",
    "    # llama_hub_url=\"https://raw.githubusercontent.com/run-llama/llama-hub/jerry/add_llama_packs/llama_hub\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "68189c79-2688-4dfd-a03d-57f4258a1368",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/jerryliu/Programming/llama-hub/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Downloading config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 571/571 [00:00<00:00, 1.02MB/s]\n",
      "Downloading pytorch_model.bin: 100%|█████████████████████████████████████████████████████████████████████████████████████| 438M/438M [00:09<00:00, 44.7MB/s]\n",
      "Downloading tokenizer_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████| 363/363 [00:00<00:00, 1.83MB/s]\n",
      "Downloading vocab.txt: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 1.80MB/s]\n",
      "Downloading tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 25.7MB/s]\n",
      "Downloading (…)cial_tokens_map.json: 100%|██████████████████████████████████████████████████████████████████████████████████| 239/239 [00:00<00:00, 953kB/s]\n"
     ]
    }
   ],
   "source": [
    "sentence_window_retriever_pack = SentenceWindowRetrieverPack(\n",
    "    documents,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88cec313-c84c-488a-8fde-759a2d3f7034",
   "metadata": {},
   "source": [
    "### Run Pack"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8806f58c-db0c-4067-baf5-d917c70a97c5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    }
   ],
   "source": [
    "# this will run the full pack\n",
    "response = sentence_window_retriever_pack.run(\"What did the author do growing up?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9f5e2126-573d-4e05-9e72-0fd20318212d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The author dropped out of college and taught himself to paint.\n"
     ]
    }
   ],
   "source": [
    "print(str(response))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "34bbbfff-8ecc-4d41-bbe7-138cbcd19265",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(response.source_nodes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a8874863-88d6-4049-88f1-499697813c22",
   "metadata": {},
   "source": [
    "### Inspect Modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a31c11a0-bb44-4d59-9557-7ac0b06f6cf6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'sentence_index': <llama_index.indices.vector_store.base.VectorStoreIndex at 0x2993fde10>,\n",
       " 'node_parser': SentenceWindowNodeParser(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x287c981f0>, sentence_splitter=<function split_by_sentence_tokenizer.<locals>.split at 0x287bad750>, window_size=3, window_metadata_key='window', original_text_metadata_key='original_text'),\n",
       " 'postprocessor': MetadataReplacementPostProcessor(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x108e24070>, target_metadata_key='window'),\n",
       " 'llm': OpenAI(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x108e24070>, model='gpt-3.5-turbo', temperature=0.1, max_tokens=None, additional_kwargs={}, max_retries=3, timeout=60.0, default_headers=None, api_key='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', api_base='https://api.openai.com/v1', api_version=''),\n",
       " 'embed_model': HuggingFaceEmbedding(model_name='sentence-transformers/all-mpnet-base-v2', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x108e24070>, tokenizer_name='sentence-transformers/all-mpnet-base-v2', max_length=512, pooling=<Pooling.CLS: 'cls'>, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None),\n",
       " 'query_engine': <llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x2e25eb850>,\n",
       " 'service_context': ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>), prompt_helper=PromptHelper(context_window=4096, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=HuggingFaceEmbedding(model_name='sentence-transformers/all-mpnet-base-v2', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x108e24070>, tokenizer_name='sentence-transformers/all-mpnet-base-v2', max_length=512, pooling=<Pooling.CLS: 'cls'>, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None), transformations=[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x108e24070>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\\n\\n\\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')], llama_logger=<llama_index.logger.base.LlamaLogger object at 0x2b7605900>, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x108e24070>)}"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "modules = sentence_window_retriever_pack.get_modules()\n",
    "display(modules)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a553866-d506-4f42-9d91-82d598e6560f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama_hub",
   "language": "python",
   "name": "llama_hub"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
